@howuse/electron-crawler 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +11 -11
- package/dist/index.mjs +149 -129
- package/dist/newsCrawler.d.ts +6 -0
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const
|
|
2
|
-
`:""});
|
|
3
|
-
`});
|
|
1
|
+
"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const A=require("electron-store"),x=require("electron"),O=require("turndown");function v(){const t=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return t||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const I=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(t,e)=>e.nodeName==="BR"?`
|
|
2
|
+
`:""});I.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
|
|
3
|
+
`});I.addRule("images",{filter:"img",replacement:(t,e)=>{const r=e.alt||"",s=e.src||e.getAttribute("src")||"",o=e.title||"";return o?`![${r}](${s} "${o}")`:`![${r}](${s})`}});function b(t){return t.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function D(t){if(!t||!t.trim())return"";try{const e=b(t);if(!e)return"";let r=I.turndown(e);return r=r.replace(/\n{3,}/g,`
|
|
4
4
|
|
|
5
5
|
`),r=r.split(`
|
|
6
6
|
`).map(s=>s.trimEnd()).join(`
|
|
7
|
-
`),r.trim()}catch(
|
|
7
|
+
`),r.trim()}catch(e){return console.error("[normalizeMarkdown] 转换失败:",e),b(t).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
|
|
8
8
|
|
|
9
|
-
`).trim()}}function
|
|
9
|
+
`).trim()}}function L(t){const e=new Date,r=e.getFullYear(),s=e.getMonth()+1,o=e.getDate(),a=e.getHours(),i=e.getMinutes();if(!t||!t.trim())return e.toISOString();const l=t.trim(),y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{1,2})[月\-/](\d{1,2})[日]?/,M=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=o,d=a,p=i,n=l.match(y);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=l.match(S),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=l.match(R),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=l.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),p=parseInt(n[4],10);else if(n=l.match(T),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=l.match(M),n)d=parseInt(n[1],10),p=parseInt(n[2],10);else{const k=new Date(l);return Number.isNaN(k.getTime())?e.toISOString():k.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(p<0||p>59)&&(p=i);const h=new Date(g,u-1,f,d,p,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?e.toISOString():C(h)}function C(t){const e=S=>S.toString().padStart(2,"0"),r=t.getFullYear(),s=e(t.getMonth()+1),o=e(t.getDate()),a=e(t.getHours()),i=e(t.getMinutes()),l=e(t.getSeconds()),y=t.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${i}:${l}.${y}`}const P=A.default||A,w=new P;async function E(){if(c.rules&&c.rules.length>0)return c.rules;if(c.devMode)return await $();const e=w.get("news.rules");let r=[];if(Array.isArray(e))r=e;else if(e&&Array.isArray(e.rules)){const o=e.updatedAt?new Date(e.updatedAt).getTime():0,a=Date.now();o>0&&a-o<=18e6?r=e.rules:w.delete("news.rules")}if(r.length>0)return r;const s=await 
$();return s.length>0?(c.devMode||w.set("news.rules",{rules:s,updatedAt:new Date().toISOString()}),s):[]}const m={running:!1,interval_ms:1800*1e3};let c={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,devMode:!1,ruleTransformer:t=>t,newsItemFieldMap:void 0},_=!1;function H(t){c={...c,...t,ruleTransformer:t.ruleTransformer||(e=>e&&typeof e=="object"&&"data"in e?e.data:e)},j()}async function $(){if(!c.rulesApiUrl)return[];try{const t=await fetch(c.rulesApiUrl);if(!t.ok)throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);const e=await t.json(),r=c.ruleTransformer(e);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(t){return console.error("[crawler] Failed to fetch rules from API:",t),[]}}async function N(t){if(!c.pushApiUrl)return;const e=w.get("news.pushedUrls")||[];if(e.includes(t.url)){console.log(`[crawler] URL already pushed, skipping: ${t.url}`);return}try{const r=F(t),s=await fetch(c.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...e,t.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function F(t){const e=c.newsItemFieldMap;if(!e||Object.keys(e).length===0)return t;const r={},s=Object.entries(t);for(const[o,a]of s){const i=e[o];if(i==="-")continue;const l=typeof i=="string"?i:o;r[l]=a}return r}async function q(t,e){return await t.webContents.executeJavaScript(`
|
|
10
10
|
(() => {
|
|
11
11
|
const links = []
|
|
12
12
|
// 在指定范围内查找所有链接
|
|
13
|
-
const rangeElements = document.querySelectorAll(${JSON.stringify(
|
|
13
|
+
const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
|
|
14
14
|
if (rangeElements.length === 0) {
|
|
15
15
|
// 如果没有找到范围,则在整个文档中查找
|
|
16
16
|
const allLinks = document.querySelectorAll('a')
|
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
const uniqueLinks = [...new Map(links.map(l => [l.href, l])).values()]
|
|
40
40
|
return uniqueLinks.map(l => l.href)
|
|
41
41
|
})()
|
|
42
|
-
`)}async function W(e,
|
|
42
|
+
`)}async function W(t,e,r){try{return await t.loadURL(r,{httpReferrer:e.base_url}),await t.webContents.executeJavaScript(`
|
|
43
43
|
(() => {
|
|
44
44
|
const pickText = (sel) => {
|
|
45
45
|
const el = document.querySelector(sel)
|
|
@@ -77,10 +77,10 @@
|
|
|
77
77
|
return clone.innerHTML || ''
|
|
78
78
|
}
|
|
79
79
|
return {
|
|
80
|
-
title: pickText(${JSON.stringify(
|
|
81
|
-
contentHtml: pickContentHtml(${JSON.stringify(
|
|
82
|
-
timeText: pickText(${JSON.stringify(
|
|
80
|
+
title: pickText(${JSON.stringify(e.title_selector)}),
|
|
81
|
+
contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(e.exclude_selectors||[])}),
|
|
82
|
+
timeText: pickText(${JSON.stringify(e.time_selector)}),
|
|
83
83
|
url: location.href
|
|
84
84
|
}
|
|
85
85
|
})()
|
|
86
|
-
`)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(
|
|
86
|
+
`)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(t){const e=v();try{await e.loadURL(t.base_url,{httpReferrer:t.base_url});const r=await q(e,t);console.log(`[crawler] found ${r.length} links from ${t.remark||t.base_url}`);const s=[];for(const o of r){const a=await W(e,t,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const i={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:D(a.contentHtml),published_at:L(a.timeText)};s.push(i);try{await N(i)}catch(l){console.warn("[crawler] push single news item failed",l)}}return console.log(`[crawler] processed ${s.length} items from ${t.remark||t.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",t.remark||t.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if(_)return;_=!0;const t=m.interval_ms,e=async()=>{const r=await E();console.log(`[crawler] scheduled run, rules=${r.length}`),m.running=!0,m.running_source=void 0;try{for(const s of r)await J(s)}finally{m.running=!1,m.running_source=void 0,m.next_run_at=new Date(Date.now()+t).toISOString()}};m.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(e,5e3),setInterval(e,t)}exports.initCrawler=H;
|
package/dist/index.mjs
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import
|
|
1
|
+
import A from "electron-store";
|
|
2
2
|
import { BrowserWindow as x } from "electron";
|
|
3
|
-
import
|
|
3
|
+
import O from "turndown";
|
|
4
4
|
function D() {
|
|
5
|
-
const
|
|
6
|
-
return
|
|
5
|
+
const t = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
|
|
6
|
+
return t || new x({
|
|
7
7
|
show: !1,
|
|
8
8
|
webPreferences: {
|
|
9
9
|
sandbox: !1
|
|
@@ -11,7 +11,7 @@ function D() {
|
|
|
11
11
|
title: "crawler-hidden-window"
|
|
12
12
|
});
|
|
13
13
|
}
|
|
14
|
-
const
|
|
14
|
+
const I = new O({
|
|
15
15
|
headingStyle: "atx",
|
|
16
16
|
// 使用 # 格式的标题
|
|
17
17
|
codeBlockStyle: "fenced",
|
|
@@ -28,152 +28,172 @@ const S = new U({
|
|
|
28
28
|
// 完整的链接引用格式
|
|
29
29
|
preformattedCode: !1,
|
|
30
30
|
// 不使用预格式化代码
|
|
31
|
-
blankReplacement: (
|
|
31
|
+
blankReplacement: (t, e) => e.nodeName === "BR" ? `
|
|
32
32
|
` : ""
|
|
33
33
|
});
|
|
34
|
-
|
|
34
|
+
I.addRule("preserveLineBreaks", {
|
|
35
35
|
filter: ["br"],
|
|
36
36
|
replacement: () => `
|
|
37
37
|
`
|
|
38
38
|
});
|
|
39
|
-
|
|
39
|
+
I.addRule("images", {
|
|
40
40
|
filter: "img",
|
|
41
|
-
replacement: (
|
|
42
|
-
const r =
|
|
43
|
-
return
|
|
41
|
+
replacement: (t, e) => {
|
|
42
|
+
const r = e.alt || "", s = e.src || e.getAttribute("src") || "", o = e.title || "";
|
|
43
|
+
return o ? `![${r}](${s} "${o}")` : `![${r}](${s})`;
|
|
44
44
|
}
|
|
45
45
|
});
|
|
46
|
-
function
|
|
47
|
-
return
|
|
46
|
+
function _(t) {
|
|
47
|
+
return t.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
|
|
48
48
|
}
|
|
49
|
-
function
|
|
50
|
-
if (!
|
|
49
|
+
function v(t) {
|
|
50
|
+
if (!t || !t.trim())
|
|
51
51
|
return "";
|
|
52
52
|
try {
|
|
53
|
-
const
|
|
54
|
-
if (!
|
|
53
|
+
const e = _(t);
|
|
54
|
+
if (!e)
|
|
55
55
|
return "";
|
|
56
|
-
let r =
|
|
56
|
+
let r = I.turndown(e);
|
|
57
57
|
return r = r.replace(/\n{3,}/g, `
|
|
58
58
|
|
|
59
59
|
`), r = r.split(`
|
|
60
60
|
`).map((s) => s.trimEnd()).join(`
|
|
61
61
|
`), r.trim();
|
|
62
|
-
} catch (
|
|
63
|
-
return console.error("[normalizeMarkdown] 转换失败:",
|
|
62
|
+
} catch (e) {
|
|
63
|
+
return console.error("[normalizeMarkdown] 转换失败:", e), _(t).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
|
|
64
64
|
|
|
65
65
|
`).trim();
|
|
66
66
|
}
|
|
67
67
|
}
|
|
68
|
-
function
|
|
69
|
-
const
|
|
70
|
-
if (!
|
|
71
|
-
return
|
|
72
|
-
const
|
|
73
|
-
let g = r, u = s, f =
|
|
68
|
+
function L(t) {
|
|
69
|
+
const e = /* @__PURE__ */ new Date(), r = e.getFullYear(), s = e.getMonth() + 1, o = e.getDate(), a = e.getHours(), i = e.getMinutes();
|
|
70
|
+
if (!t || !t.trim())
|
|
71
|
+
return e.toISOString();
|
|
72
|
+
const l = t.trim(), y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, U = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, M = /(\d{1,2})[:时](\d{1,2})[分]?/;
|
|
73
|
+
let g = r, u = s, f = o, d = a, m = i, n = l.match(y);
|
|
74
74
|
if (n)
|
|
75
|
-
g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10),
|
|
76
|
-
else if (n =
|
|
77
|
-
g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10),
|
|
78
|
-
else if (n =
|
|
75
|
+
g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), m = parseInt(n[5], 10);
|
|
76
|
+
else if (n = l.match(S), n)
|
|
77
|
+
g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), m = parseInt(n[5], 10);
|
|
78
|
+
else if (n = l.match(R), n)
|
|
79
79
|
g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
|
|
80
|
-
else if (n =
|
|
81
|
-
u = parseInt(n[1], 10), f = parseInt(n[2], 10),
|
|
82
|
-
else if (n =
|
|
80
|
+
else if (n = l.match(U), n)
|
|
81
|
+
u = parseInt(n[1], 10), f = parseInt(n[2], 10), d = parseInt(n[3], 10), m = parseInt(n[4], 10);
|
|
82
|
+
else if (n = l.match(T), n)
|
|
83
83
|
u = parseInt(n[1], 10), f = parseInt(n[2], 10);
|
|
84
|
-
else if (n =
|
|
85
|
-
|
|
84
|
+
else if (n = l.match(M), n)
|
|
85
|
+
d = parseInt(n[1], 10), m = parseInt(n[2], 10);
|
|
86
86
|
else {
|
|
87
|
-
const
|
|
88
|
-
return Number.isNaN(
|
|
87
|
+
const k = new Date(l);
|
|
88
|
+
return Number.isNaN(k.getTime()) ? e.toISOString() : k.toISOString();
|
|
89
89
|
}
|
|
90
|
-
(u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f =
|
|
91
|
-
const h = new Date(g, u - 1, f,
|
|
92
|
-
return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ?
|
|
90
|
+
(u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (d < 0 || d > 23) && (d = a), (m < 0 || m > 59) && (m = i);
|
|
91
|
+
const h = new Date(g, u - 1, f, d, m, 0, 0);
|
|
92
|
+
return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ? e.toISOString() : C(h);
|
|
93
93
|
}
|
|
94
|
-
function C(
|
|
95
|
-
const
|
|
96
|
-
return `${r}-${s}-${
|
|
94
|
+
function C(t) {
|
|
95
|
+
const e = (S) => S.toString().padStart(2, "0"), r = t.getFullYear(), s = e(t.getMonth() + 1), o = e(t.getDate()), a = e(t.getHours()), i = e(t.getMinutes()), l = e(t.getSeconds()), y = t.getMilliseconds().toString().padStart(3, "0");
|
|
96
|
+
return `${r}-${s}-${o}T${a}:${i}:${l}.${y}`;
|
|
97
97
|
}
|
|
98
|
-
const
|
|
99
|
-
async function
|
|
100
|
-
if (
|
|
101
|
-
return
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
98
|
+
const E = A.default || A, w = new E();
|
|
99
|
+
async function H() {
|
|
100
|
+
if (c.rules && c.rules.length > 0)
|
|
101
|
+
return c.rules;
|
|
102
|
+
if (c.devMode)
|
|
103
|
+
return await b();
|
|
104
|
+
const e = w.get("news.rules");
|
|
105
|
+
let r = [];
|
|
106
|
+
if (Array.isArray(e))
|
|
107
|
+
r = e;
|
|
108
|
+
else if (e && Array.isArray(e.rules)) {
|
|
109
|
+
const o = e.updatedAt ? new Date(e.updatedAt).getTime() : 0, a = Date.now();
|
|
110
|
+
o > 0 && a - o <= 18e6 ? r = e.rules : w.delete("news.rules");
|
|
111
|
+
}
|
|
112
|
+
if (r.length > 0)
|
|
113
|
+
return r;
|
|
114
|
+
const s = await b();
|
|
115
|
+
return s.length > 0 ? (c.devMode || w.set("news.rules", {
|
|
116
|
+
rules: s,
|
|
117
|
+
updatedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
118
|
+
}), s) : [];
|
|
107
119
|
}
|
|
108
|
-
const
|
|
120
|
+
const p = {
|
|
109
121
|
running: !1,
|
|
110
122
|
interval_ms: 1800 * 1e3
|
|
111
123
|
};
|
|
112
|
-
let
|
|
124
|
+
let c = {
|
|
113
125
|
rules: [],
|
|
114
126
|
rulesApiUrl: void 0,
|
|
115
127
|
pushApiUrl: void 0,
|
|
116
|
-
|
|
128
|
+
devMode: !1,
|
|
129
|
+
ruleTransformer: (t) => t,
|
|
117
130
|
newsItemFieldMap: void 0
|
|
118
131
|
}, $ = !1;
|
|
119
|
-
function z(
|
|
120
|
-
|
|
121
|
-
...
|
|
122
|
-
...
|
|
132
|
+
function z(t) {
|
|
133
|
+
c = {
|
|
134
|
+
...c,
|
|
135
|
+
...t,
|
|
123
136
|
// 确保ruleTransformer始终存在
|
|
124
|
-
ruleTransformer:
|
|
137
|
+
ruleTransformer: t.ruleTransformer || ((e) => e && typeof e == "object" && "data" in e ? e.data : e)
|
|
125
138
|
}, W();
|
|
126
139
|
}
|
|
127
|
-
async function
|
|
128
|
-
if (!
|
|
140
|
+
async function b() {
|
|
141
|
+
if (!c.rulesApiUrl)
|
|
129
142
|
return [];
|
|
130
143
|
try {
|
|
131
|
-
const
|
|
132
|
-
if (!
|
|
133
|
-
throw new Error(`Failed to fetch rules from API: ${
|
|
134
|
-
const
|
|
144
|
+
const t = await fetch(c.rulesApiUrl);
|
|
145
|
+
if (!t.ok)
|
|
146
|
+
throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);
|
|
147
|
+
const e = await t.json(), r = c.ruleTransformer(e);
|
|
135
148
|
return Array.isArray(r) ? r : (console.warn("[crawler] Rules API returned non-array data, using empty array instead"), []);
|
|
136
|
-
} catch (
|
|
137
|
-
return console.error("[crawler] Failed to fetch rules from API:",
|
|
149
|
+
} catch (t) {
|
|
150
|
+
return console.error("[crawler] Failed to fetch rules from API:", t), [];
|
|
138
151
|
}
|
|
139
152
|
}
|
|
140
|
-
async function
|
|
141
|
-
if (
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
}
|
|
153
|
+
async function P(t) {
|
|
154
|
+
if (!c.pushApiUrl)
|
|
155
|
+
return;
|
|
156
|
+
const e = w.get("news.pushedUrls") || [];
|
|
157
|
+
if (e.includes(t.url)) {
|
|
158
|
+
console.log(`[crawler] URL already pushed, skipping: ${t.url}`);
|
|
159
|
+
return;
|
|
160
|
+
}
|
|
161
|
+
try {
|
|
162
|
+
const r = N(t), s = await fetch(c.pushApiUrl, {
|
|
163
|
+
method: "POST",
|
|
164
|
+
headers: {
|
|
165
|
+
"Content-Type": "application/json"
|
|
166
|
+
},
|
|
167
|
+
body: JSON.stringify(r)
|
|
168
|
+
});
|
|
169
|
+
if (!s.ok)
|
|
170
|
+
throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);
|
|
171
|
+
const o = [...e, t.url];
|
|
172
|
+
w.set("news.pushedUrls", o), console.log("[crawler] Results pushed to API successfully");
|
|
173
|
+
} catch (r) {
|
|
174
|
+
console.error("[crawler] Failed to push results to API:", r);
|
|
175
|
+
}
|
|
156
176
|
}
|
|
157
|
-
function N(
|
|
158
|
-
const
|
|
159
|
-
if (!
|
|
160
|
-
return
|
|
161
|
-
const r = {}, s = Object.entries(
|
|
162
|
-
for (const [
|
|
163
|
-
const
|
|
164
|
-
if (
|
|
165
|
-
const
|
|
166
|
-
r[
|
|
177
|
+
function N(t) {
|
|
178
|
+
const e = c.newsItemFieldMap;
|
|
179
|
+
if (!e || Object.keys(e).length === 0)
|
|
180
|
+
return t;
|
|
181
|
+
const r = {}, s = Object.entries(t);
|
|
182
|
+
for (const [o, a] of s) {
|
|
183
|
+
const i = e[o];
|
|
184
|
+
if (i === "-") continue;
|
|
185
|
+
const l = typeof i == "string" ? i : o;
|
|
186
|
+
r[l] = a;
|
|
167
187
|
}
|
|
168
188
|
return r;
|
|
169
189
|
}
|
|
170
|
-
async function F(
|
|
171
|
-
return await
|
|
190
|
+
async function F(t, e) {
|
|
191
|
+
return await t.webContents.executeJavaScript(
|
|
172
192
|
`
|
|
173
193
|
(() => {
|
|
174
194
|
const links = []
|
|
175
195
|
// 在指定范围内查找所有链接
|
|
176
|
-
const rangeElements = document.querySelectorAll(${JSON.stringify(
|
|
196
|
+
const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
|
|
177
197
|
if (rangeElements.length === 0) {
|
|
178
198
|
// 如果没有找到范围,则在整个文档中查找
|
|
179
199
|
const allLinks = document.querySelectorAll('a')
|
|
@@ -205,9 +225,9 @@ async function F(e, t) {
|
|
|
205
225
|
`
|
|
206
226
|
);
|
|
207
227
|
}
|
|
208
|
-
async function q(
|
|
228
|
+
async function q(t, e, r) {
|
|
209
229
|
try {
|
|
210
|
-
return await
|
|
230
|
+
return await t.loadURL(r, { httpReferrer: e.base_url }), await t.webContents.executeJavaScript(
|
|
211
231
|
`
|
|
212
232
|
(() => {
|
|
213
233
|
const pickText = (sel) => {
|
|
@@ -246,11 +266,11 @@ async function q(e, t, r) {
|
|
|
246
266
|
return clone.innerHTML || ''
|
|
247
267
|
}
|
|
248
268
|
return {
|
|
249
|
-
title: pickText(${JSON.stringify(
|
|
250
|
-
contentHtml: pickContentHtml(${JSON.stringify(
|
|
251
|
-
|
|
269
|
+
title: pickText(${JSON.stringify(e.title_selector)}),
|
|
270
|
+
contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(
|
|
271
|
+
e.exclude_selectors || []
|
|
252
272
|
)}),
|
|
253
|
-
timeText: pickText(${JSON.stringify(
|
|
273
|
+
timeText: pickText(${JSON.stringify(e.time_selector)}),
|
|
254
274
|
url: location.href
|
|
255
275
|
}
|
|
256
276
|
})()
|
|
@@ -260,39 +280,39 @@ async function q(e, t, r) {
|
|
|
260
280
|
return console.warn("[crawler] failed to extract page content", r, s), null;
|
|
261
281
|
}
|
|
262
282
|
}
|
|
263
|
-
async function J(
|
|
264
|
-
const
|
|
283
|
+
async function J(t) {
|
|
284
|
+
const e = D();
|
|
265
285
|
try {
|
|
266
|
-
await
|
|
267
|
-
const r = await F(
|
|
268
|
-
console.log(`[crawler] found ${r.length} links from ${
|
|
286
|
+
await e.loadURL(t.base_url, { httpReferrer: t.base_url });
|
|
287
|
+
const r = await F(e, t);
|
|
288
|
+
console.log(`[crawler] found ${r.length} links from ${t.remark || t.base_url}`);
|
|
269
289
|
const s = [];
|
|
270
|
-
for (const
|
|
271
|
-
const
|
|
272
|
-
if (!
|
|
273
|
-
console.log(`[crawler] skip empty result for ${
|
|
290
|
+
for (const o of r) {
|
|
291
|
+
const a = await q(e, t, o);
|
|
292
|
+
if (!a || !a.title || !a.contentHtml) {
|
|
293
|
+
console.log(`[crawler] skip empty result for ${o}`);
|
|
274
294
|
continue;
|
|
275
295
|
}
|
|
276
|
-
const
|
|
277
|
-
url:
|
|
278
|
-
title:
|
|
279
|
-
content_html:
|
|
280
|
-
content_markdown:
|
|
281
|
-
published_at:
|
|
296
|
+
const i = {
|
|
297
|
+
url: a.url || o,
|
|
298
|
+
title: a.title,
|
|
299
|
+
content_html: a.contentHtml,
|
|
300
|
+
content_markdown: v(a.contentHtml),
|
|
301
|
+
published_at: L(a.timeText)
|
|
282
302
|
};
|
|
283
|
-
s.push(
|
|
303
|
+
s.push(i);
|
|
284
304
|
try {
|
|
285
|
-
await
|
|
286
|
-
} catch (
|
|
287
|
-
console.warn("[crawler] push single news item failed",
|
|
305
|
+
await P(i);
|
|
306
|
+
} catch (l) {
|
|
307
|
+
console.warn("[crawler] push single news item failed", l);
|
|
288
308
|
}
|
|
289
309
|
}
|
|
290
|
-
return console.log(`[crawler] processed ${s.length} items from ${
|
|
310
|
+
return console.log(`[crawler] processed ${s.length} items from ${t.remark || t.base_url}`), {
|
|
291
311
|
success: !0,
|
|
292
312
|
data: s
|
|
293
313
|
};
|
|
294
314
|
} catch (r) {
|
|
295
|
-
return console.warn("[crawler] rule failed",
|
|
315
|
+
return console.warn("[crawler] rule failed", t.remark || t.base_url, r), {
|
|
296
316
|
success: !1,
|
|
297
317
|
error: r instanceof Error ? r.message : String(r)
|
|
298
318
|
};
|
|
@@ -301,17 +321,17 @@ async function J(e) {
|
|
|
301
321
|
function W() {
|
|
302
322
|
if ($) return;
|
|
303
323
|
$ = !0;
|
|
304
|
-
const
|
|
305
|
-
const r = await
|
|
306
|
-
console.log(`[crawler] scheduled run, rules=${r.length}`),
|
|
324
|
+
const t = p.interval_ms, e = async () => {
|
|
325
|
+
const r = await H();
|
|
326
|
+
console.log(`[crawler] scheduled run, rules=${r.length}`), p.running = !0, p.running_source = void 0;
|
|
307
327
|
try {
|
|
308
328
|
for (const s of r)
|
|
309
329
|
await J(s);
|
|
310
330
|
} finally {
|
|
311
|
-
|
|
331
|
+
p.running = !1, p.running_source = void 0, p.next_run_at = new Date(Date.now() + t).toISOString();
|
|
312
332
|
}
|
|
313
333
|
};
|
|
314
|
-
|
|
334
|
+
p.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(e, 5e3), setInterval(e, t);
|
|
315
335
|
}
|
|
316
336
|
export {
|
|
317
337
|
z as initCrawler
|
package/dist/newsCrawler.d.ts
CHANGED
|
@@ -19,6 +19,12 @@ export type CrawlerConfig = {
|
|
|
19
19
|
rulesApiUrl?: string;
|
|
20
20
|
pushApiUrl?: string;
|
|
21
21
|
ruleTransformer?: (data: any) => any;
|
|
22
|
+
/**
|
|
23
|
+
* 是否处于开发模式
|
|
24
|
+
* - 开发模式:不使用本地缓存,每次都优先使用内存 rules,其次直接从 API 拉取
|
|
25
|
+
* - 生产模式:会使用本地缓存(带 5 小时过期时间)
|
|
26
|
+
*/
|
|
27
|
+
devMode?: boolean;
|
|
22
28
|
newsItemFieldMap?: Partial<Record<keyof NewsItem, string | '-'>>;
|
|
23
29
|
};
|
|
24
30
|
export type NewsItem = {
|